
import json
import click
import numpy as np
import pandas as pd
from sklearn.preprocessing import scale
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.model_selection import GridSearchCV

################################################################################

def get_feature_name(feature_set_name):
    """Return the ordered list of feature column names for a feature set.

    Args:
        feature_set_name: Either "narrow" or "wide".

    Returns:
        A new list of column-name strings. Both sets start with the same
        location/room/group columns; "wide" additionally contains the
        per-category gender_*, glass_* and race_* columns.

    Raises:
        ValueError: If `feature_set_name` is not one of the known names.
            (Previously the function fell through and returned None.)
    """
    # Columns that lead both feature sets, in their original order.
    shared_columns = [
        "location_boston",
        "location_arizona",
        "location_texas",
        "location_nyc",
        "room_theater",
        "room_dig",
        "room_office",
        "room_home",
        "room_apartment",
        "room_agency",
        "num_ppl",
        "pos_mst_avg",
        "age_mean",
        "age_std",
    ]
    aggregate_columns = ["gender_entropy", "p_glass", "race_entropy"]
    smile_columns = ["smile_coef_mean", "smile_coef_std"]

    if feature_set_name == "narrow":
        return shared_columns + aggregate_columns + smile_columns

    if feature_set_name == "wide":
        per_category_columns = [
            "gender_NA",
            "gender_female",
            "gender_male",
            "glass_NA",
            "glass_dark",
            "glass_none",
            "glass_normal",
            "race_NA",
            "race_asian",
            "race_black",
            "race_white",
        ]
        # NB: in the wide set the smile columns come before the aggregate
        # columns; the order is kept so the column layout is unchanged.
        return (shared_columns + per_category_columns
                + smile_columns + aggregate_columns)

    raise ValueError(
        "unknown feature_set_name %r; expected 'narrow' or 'wide'"
        % (feature_set_name,)
    )

################################################################################

def make_classifier_list(inner_cv, verbose=False):
    """Build the classifiers to compare, together with their labels.

    Args:
        inner_cv: Cross-validation splitter handed to every GridSearchCV as
            its `cv` argument (used for hyper-parameter selection).
        verbose: If true, print each label and classifier object.

    Returns:
        A tuple `(clfs, clf_names)` of two lists of equal length, where
        `clf_names[i]` is the label of `clfs[i]`.
    """
    # NB: using random_state=0 on the estimators that take one here
    # (LinearSVC, random forest, gradient boosting) to ensure replicability.

    def grid_search(estimator, param_grid):
        # Wrap an estimator in a grid search scored by ROC AUC on inner_cv.
        return GridSearchCV(
            estimator=estimator,
            param_grid=param_grid,
            cv=inner_cv,
            scoring="roc_auc"
        )

    # Keep each label next to its classifier so the two returned lists
    # cannot drift out of sync.
    named_clfs = [
        ("dummy_most_freq", DummyClassifier(strategy="most_frequent")),
        ("logistic_regression", LogisticRegression()),
        ("naive bayes", GaussianNB()),
        ("linear_svm", grid_search(
            svm.LinearSVC(random_state=0),
            {"C": [10.0, 1.0, 0.1, 0.01]}
        )),
        ("random_forest", grid_search(
            RandomForestClassifier(random_state=0),
            {"n_estimators": [10, 100, 1000, 10000]}
        )),
        ("gradient_boosting", grid_search(
            GradientBoostingClassifier(random_state=0),
            {"n_estimators": [100, 1000, 10000]}
        )),
    ]

    clfs = [clf for _, clf in named_clfs]
    clf_names = [name for name, _ in named_clfs]

    if verbose:
        for clf_name, clf in named_clfs:
            # Single pre-formatted string: prints the same text under both
            # Python 2 and Python 3.
            print("%s %s" % (clf_name, clf))

    return clfs, clf_names

################################################################################
#
# MAIN
#
################################################################################

@click.command()
@click.option('--root', default="../../data/")
@click.option('--feature_set_name', default="wide")
@click.option('--n_folds', default=10)
@click.option('--scaling_on/--scaling_off', default=False)
@click.option('--n_jobs', default=10)
def main(root, feature_set_name, n_folds, scaling_on, n_jobs=10):
    """Cross-validate every classifier and save the scores as JSON.

    Reads <root>/features_labels_wide.csv, evaluates each classifier from
    make_classifier_list() with stratified k-fold cross-validation, prints
    the train/test accuracy per classifier and writes all scores to
    <root>/pred_res_nested/pred_<feature_set_name>_<scaled|not_scaled>.json.

    Args:
        root: Data directory containing the input CSV; results are written
            below it.
        feature_set_name: Feature set passed to get_feature_name().
        n_folds: Number of splits for both the inner and the outer CV.
        scaling_on: If true, apply sklearn's scale() to the features.
        n_jobs: Number of parallel jobs passed to cross_validate().
    """
    import os  # local import: only needed to prepare the output directory

    # example:
    # python test_classifiers.py --feature_set_name wide --scaling_off

    # print parameters (single pre-formatted strings print identically
    # under Python 2 and Python 3)
    separator = "-------------------------------------------------------------------"
    print(separator)
    print("root: %s" % (root,))
    print("feature_set_name: %s" % (feature_set_name,))
    print("scaling_on: %s" % (scaling_on,))
    print("n_folds: %s" % (n_folds,))
    print(separator)

    #
    # Read Data
    #
    df = pd.read_csv("%s/features_labels_wide.csv" % root)
    feature_names = get_feature_name(feature_set_name)
    if feature_names is None:
        # Guard against an unrecognised feature set instead of passing None
        # on as the column selection.
        raise click.BadParameter(
            "unknown feature set %r" % (feature_set_name,),
            param_hint="--feature_set_name"
        )

    # DataFrame.as_matrix() was deprecated and later removed from pandas;
    # selecting the columns by label and taking .values is the replacement.
    X = df[feature_names].values
    # Binary target: 1 where the 'escaped' column equals "Y", else 0.
    y = (df['escaped'].values == "Y").astype(int)

    if scaling_on:
        # NOTE(review): scale() sees the whole data set before it is split
        # into folds, so held-out rows influence the scaling statistics.
        # Consider scaling inside a Pipeline if that matters -- confirm.
        X = scale(X)

    print("Read data ---")
    print("X.shape: %s" % (X.shape,))
    print("y.shape: %s" % (y.shape,))
    print("-------------")

    # Prepare the output location before the cross-validation, so a missing
    # directory does not surface only after all the work is done.
    out_dir = "%s/pred_res_nested" % root
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)
    out_path = "%s/pred_%s_%s.json" % (
        out_dir,
        feature_set_name,
        "scaled" if scaling_on else "not_scaled"
    )

    #
    # Run Stratified K-fold Crossvalidation
    #
    all_results = {}

    inner_cv = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=0)
    outer_cv = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=0)

    # Make classifier objects
    clfs, clf_names = make_classifier_list(inner_cv)
    print("Created %d classifiers" % len(clfs))

    # loop through the classifiers and run x-validation for each
    for clf, clf_name in zip(clfs, clf_names):
        res = cross_validate(
            clf,
            X,
            y,
            cv=outer_cv,
            scoring=["roc_auc", "accuracy"],
            return_train_score=True,
            n_jobs=n_jobs
        )

        # numpy arrays are not JSON-serialisable; store plain lists
        all_results[clf_name] = {k: v.tolist() for k, v in res.items()}

        # print results (only accuracy is printed; the ROC AUC scores are
        # still part of `res` and therefore saved to the JSON file)
        print("%-25s | ACC | train: %.3f (%.3f) | test: %.3f (%.3f)" % (
                clf_name,
                res["train_accuracy"].mean(), res["train_accuracy"].std(),
                res["test_accuracy"].mean(), res["test_accuracy"].std()))

    # save files to json
    all_results["settings"] = {
        "feature_set_name": feature_set_name,
        "scaling_on": scaling_on,
        "n_folds": n_folds
    }

    # Context manager guarantees the file is flushed and closed.
    with open(out_path, "w") as out_file:
        json.dump(all_results, out_file, indent=2)

    print("Done!")

################################################################################

# Run the click command only when this file is executed as a script.
if __name__ == "__main__":
    main()

# END
